knitr::opts_chunk$set(echo = TRUE)
if (knitr::is_latex_output()) {
MODE <- "PDF"
} else if (knitr::is_html_output()) {
MODE <- "HTML"
} else {
MODE <- "OTHER"
}
print(MODE)
## [1] "HTML"
load all packages required
library(cowplot)
library(ggforce)
library(ggpubr)
library(ggrepel)
library(knitr)
library(paletteer)
library(ggalt)
library(plotly)
library(ggbeeswarm)
library(scico)
library(cividis)
library(ggrastr)
library(tools)
library(mgcv) # For fitting GAM model
library(h2o)
library(Metrics)
library(zoo)
library(scam)
library(DescTools)
library(tidyverse)
# library(PCAtools)
# library(gridExtra)
# library(forcats)
library(ggrastr)
# library(DescTools)
theme_set(theme_cowplot(12))
############################################################################################### ############################################################################################### ############################################################################################### # #################### apply UTR model to piRNA clusters ################################### ############################################################################################### ############################################################################################### ###############################################################################################
#load data
X= read_tsv("tiles.100_-600_750_quantified.allLIBs.txt")
## Rows: 82323 Columns: 223
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): CHR, FBgn, STRAND, TEorientation, TEorientation_nucTILE
## dbl (218): START, STOP, X, thickSTART, thickSTOP, nPOS, RNAseq_RPKM, TTseq_R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
VARI="sRNAdata_average_WT"
NUCclass="diNUC_"
# filter data
TABLEfilt_orig= X %>%
filter(!str_detect(FBgn,"GL"))%>%
filter(str_detect(FBgn,"UTR")|str_detect(FBgn,"CLUSTER")|str_detect(FBgn,"CDS"))%>%
select(-contains("ARMI"))%>%
mutate(
TYPE = case_when(
str_detect(FBgn,"UTR") ~ "UTR",
str_detect(FBgn,"CDS") ~ "CDS",
str_detect(FBgn,"CLUSTER") ~ "CLUSTER",
TRUE ~ "Other"
)
)%>%
separate(FBgn, c("CLUSTERname","tilePOS"), sep=":!:", remove=FALSE,convert=TRUE)%>%
separate(CLUSTERname, c("CLUSTERname"), sep="_", remove=TRUE,convert=TRUE)%>%
separate(CLUSTERname, c("clusterID","POS"), sep="-", remove=FALSE,convert=TRUE)%>%
mutate(POS = case_when(
TYPE == "CLUSTER" ~(POS-1)*500+(tilePOS-1)*100,
TRUE ~ tilePOS ))%>%
arrange(POS)%>%
filter(POS<600000)%>%
rename(sRNAdata_average_WT = `sRNAdata_average-WT_CLUSTERuniq`,
sRNAdata_average_YB = `sRNAdata_average-YB_CLUSTERuniq`)%>%
{}
## Warning: Expected 1 pieces. Additional pieces discarded in 78196 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 75262 rows [438, 439,
## 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455,
## 456, 457, ...].
#log-transform
TABLEfilt_X <- TABLEfilt_orig %>%
filter(!is.na(sRNAdata_average_WT),!is.na(sRNAdata_average_WT),!is.infinite(sRNAdata_average_WT), sRNAdata_average_WT>0.001)%>%
select( starts_with(NUCclass),CHR,FBgn, VARI,sRNAdata_average_WT,sRNAdata_average_YB, RNAseq_RPKM,TYPE,TTseq_RPKM,nPOS,fullLengthUNIQ,UNIQinclCLUSTERuniq,UNIQ,TEorientation,TEorientation_nucTILE, POS)%>%
{}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(VARI)
##
## # Now:
## data %>% select(all_of(VARI))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
TABLEfilt_CLUSTER = TABLEfilt_X %>%
mutate(nPOS = nPOS+20,
UTR_LENGTH=100000)%>%
filter(str_detect(FBgn,"flam") )%>%
select(CHR,FBgn, VARI,sRNAdata_average_WT, RNAseq_RPKM,TYPE, starts_with(NUCclass),TTseq_RPKM,nPOS,fullLengthUNIQ,UNIQinclCLUSTERuniq,UNIQ,TEorientation,TEorientation_nucTILE,POS )%>%
{}
TABLEfilt_Y = TABLEfilt_orig %>%
mutate(
across(
starts_with(c("sRNAdata_")),
~.x / TTseq_RPKM,
.names = "TTnorm_{.col}"
),
across(
starts_with(c("sRNAdata_")),
~.x / RNAseq_RPKM,
.names = "RNAnorm_{.col}"
),
across(
starts_with(c("CLIP_")),
~.x / RNAseq_RPKM,
.names = "RNAnorm_{.col}"
)
)%>%
filter(POS<600000)%>%
mutate(
BIN = as.factor(cut_interval(log10(TTnorm_sRNAdata_average_WT), n=5, labels = FALSE))
) %>%
filter(! is.na(BIN))%>%
{}
# Get bin breakpoints
bin_breaks <- TABLEfilt_Y %>%
group_by(BIN) %>%
summarise(minDATA = min(TTnorm_sRNAdata_average_WT, na.rm = TRUE)) %>%
pull(minDATA)
# Remove first bin (we don’t need a line at the lowest boundary)
bin_breaks <- sort(bin_breaks[-1])
TABLEfilt_Y %>%
filter(UNIQinclCLUSTERuniq > 50 , sRNAdata_average_WT>0,RNAseq_RPKM>0, TTseq_RPKM>0)%>%
mutate(
dotSIZE = case_when(
TYPE == "CLUSTER" ~ 1.5,
TRUE ~ 0.01
)
)%>%
mutate(
TYPE = factor(TYPE, levels = c("CDS","UTR","CLUSTER"))
)%>%
ggplot(aes(x=TYPE,y=TTnorm_sRNAdata_average_WT,colour = TYPE))+
# geom_point()+
geom_quasirandom_rast( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
geom_hline(yintercept = bin_breaks, linetype = "dashed", color = "red", size = 1) +
annotate("text", x = 1.4, y = bin_breaks,
label = round(bin_breaks, 3), hjust = 1, vjust = 2, color = "red") +
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+
scale_y_log10(limits=c(0.0001,3500))+
# scale_color_igv()+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
scale_size_identity()+
coord_cartesian(clip = "off")+
labs(
y = "piRNA efficiency (sRNA / RNAseq)"
)+
annotation_logticks(sides = "l", outside=FALSE) +
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank()
)+
{}
ggsave("flamenco_vs_UTR_vs_CDS_piRNA_efficiency.pdf", width = 4, height = 5)
TABLEfilt_Y %>%
filter(UNIQinclCLUSTERuniq > 50 , sRNAdata_average_WT>0,RNAseq_RPKM>0, TTseq_RPKM>0)%>%
mutate(
dotSIZE = case_when(
TYPE == "CLUSTER" ~ 1.5,
TRUE ~ 0.01
)
)%>%
mutate(
TYPE = factor(TYPE, levels = c("CDS","UTR","CLUSTER"))
)%>%
ggplot(aes(x=TYPE,y=TTnorm_sRNAdata_average_YB,colour = TYPE))+
# geom_point()+
geom_quasirandom_rast( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
scale_y_log10(limits=c(0.0001,3500))+
# scale_color_igv()+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
scale_size_identity()+
coord_cartesian(clip = "off")+
labs(
y = "piRNA efficiency (sRNA / RNAseq)"
)+
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+
annotation_logticks(sides = "l", outside=FALSE) +
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank()
)+
{}
ggsave("flamenco_vs_UTR_vs_CDS_piRNA_efficiency.YB.pdf", width = 4, height = 5)
TABLEfilt_Y %>%
filter(UNIQinclCLUSTERuniq > 50 ,RNAseq_RPKM>0, TTseq_RPKM>0)%>%
pivot_longer(cols=starts_with("diNUC_TT"), names_to = "NUC", values_to = "VAL")%>%
mutate(
TYPE = factor(TYPE, levels = c("CDS","UTR","CLUSTER")),
BIN= as.numeric(BIN)
)%>%
ggplot(aes(y=VAL))+
geom_quasirandom_rast(
data = . %>% filter(TYPE == "UTR"),
aes(x = 1+0 + 0.2, color = TYPE),
width = 0.3,
varwidth = FALSE,
groupOnX = FALSE,
size = 0.01,
alpha = 1
) +
geom_quasirandom_rast(
data = . %>% filter(TYPE == "CDS"),
aes(x = 1 - 0.2, color = TYPE),
width = 0.3, # controls horizontal spread
varwidth = FALSE, # fixed width
groupOnX = FALSE,
size = 0.01,
alpha = 1
) +
geom_quasirandom_rast(
data = . %>% filter(TYPE == "CLUSTER"),
aes(x = 1 , color = TYPE),
width = 0.3,
varwidth = FALSE,
groupOnX = FALSE,
size = 0.1,
alpha = 1
) +
facet_row(~BIN)+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
# scale_y_log10()+
coord_cartesian( ylim = c(0,25))+
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank()
)+
{}
ggsave("flamenco_vs_UTR_vs_CDS_nucleotide_content_per_bin.UU.pdf", width = 8.5, height = 5)
TABLEfilt_Y %>%
filter(UNIQinclCLUSTERuniq > 50 ,RNAseq_RPKM>0, TTseq_RPKM>0)%>%
pivot_longer(cols=starts_with("NUC_T"), names_to = "NUC", values_to = "VAL")%>%
mutate(
TYPE = factor(TYPE, levels = c("CDS","UTR","CLUSTER")),
BIN= as.numeric(BIN)
)%>%
ggplot(aes(y=VAL))+
geom_quasirandom_rast(
data = . %>% filter(TYPE == "UTR"),
aes(x = 1+0 + 0.2, color = TYPE),
width = 0.3,
varwidth = FALSE,
groupOnX = FALSE,
size = 0.01,
alpha = 1
) +
geom_quasirandom_rast(
data = . %>% filter(TYPE == "CDS"),
aes(x = 1 - 0.2, color = TYPE),
width = 0.3, # controls horizontal spread
varwidth = FALSE, # fixed width
groupOnX = FALSE,
size = 0.01,
alpha = 1
) +
geom_quasirandom_rast(
data = . %>% filter(TYPE == "CLUSTER"),
aes(x = 1 , color = TYPE),
width = 0.3,
varwidth = FALSE,
groupOnX = FALSE,
size = 0.01,
alpha = 1
) +
stat_summary(
data = . %>% filter(TYPE == "CLUSTER"),
aes(x = 1, y = VAL),
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red",
inherit.aes = FALSE
)+
stat_summary(
data = . %>% filter(TYPE == "UTR"),
aes(x = 1+0.2, y = VAL),
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red",
inherit.aes = FALSE
)+
stat_summary(
data = . %>% filter(TYPE == "CDS"),
aes(x = 1-0.2, y = VAL),
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red",
inherit.aes = FALSE
)+
facet_row(~BIN)+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
# scale_y_log10()+
# coord_cartesian( ylim = c(0,25))+
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank()
)+
{}
ggsave("flamenco_vs_UTR_vs_CDS_nucleotide_content_per_bin.U.pdf", width = 3, height = 5)
# Filter and prepare the data
TTdata <- TABLEfilt_CLUSTER %>%
filter(TTseq_RPKM > 1,fullLengthUNIQ >= 50) %>%
select(CHR,FBgn,POS, TTseq_RPKM)%>%
{}
# Fit a monotonic decreasing smooth curve using scam
scam_fit_TTseq <- scam(log10(TTseq_RPKM) ~ s(as.numeric(POS), bs = "mpd"), data = TTdata, sp=0.1)
plot(scam_fit_TTseq)
#predict on fake data
fakeX= data.frame(
# FBgn=c("flam-1_CLUSTER:!:1","flam-2_CLUSTER:!:1","flam-3_CLUSTER:!:1","flam-4_CLUSTER:!:1","flam-81_CLUSTER:!:1","flam-88_CLUSTER:!:1","flam-89_CLUSTER:!:1","flam-90_CLUSTER:!:1","flam-665_CLUSTER:!:1","flam-666_CLUSTER:!:1","flam-667_CLUSTER:!:1","flam-668_CLUSTER:!:1","flam-1164_CLUSTER:!:1","flam-1165_CLUSTER:!:1","flam-1166_CLUSTER:!:1","flam-1167_CLUSTER:!:1"),
# VAL=c(750,750,750,750,180,180,180,180,20,20,20,20,1,1,1,1)
# )%>%
FBgn=c("flam-1_CLUSTER:!:1","flam-2_CLUSTER:!:1","flam-3_CLUSTER:!:1","flam-4_CLUSTER:!:1","flam-81_CLUSTER:!:1","flam-88_CLUSTER:!:1","flam-89_CLUSTER:!:1","flam-90_CLUSTER:!:1","flam-665_CLUSTER:!:1","flam-666_CLUSTER:!:1","flam-667_CLUSTER:!:1","flam-668_CLUSTER:!:1","flam-1164_CLUSTER:!:1","flam-1165_CLUSTER:!:1","flam-1166_CLUSTER:!:1","flam-1167_CLUSTER:!:1"),
TTseq_RPKM=c(700,700,700,700,200,200,200,200,10,10,10,10,1.5,1.5,1.5,1.5)
)%>%
separate(FBgn, c("CLUSTERname","tilePOS"), sep=":!:", remove=FALSE,convert=TRUE)%>%
separate(CLUSTERname, c("CLUSTERname"), sep="_", remove=TRUE,convert=TRUE)%>%
separate(CLUSTERname, c("clusterID","POS"), sep="-", remove=FALSE,convert=TRUE)%>%
mutate(POS = (POS-1)*500+(tilePOS-1)*100)%>%
arrange(POS)
FAKEscam_fit_TTseq <- scam(log10(TTseq_RPKM) ~ s(as.numeric(POS), bs = "mpd"), data = fakeX, sp=0.1)
plot(FAKEscam_fit_TTseq)
# Add the fitted values to the data frame
TTdata = TABLEfilt_CLUSTER %>%
filter(TTseq_RPKM > 1,UNIQinclCLUSTERuniq >= 50)
TTdata$Fitted_VAL <- 10^predict(scam_fit_TTseq,newdata = data.frame(POS = TTdata$POS))
TTdata$Fitted_fakeVAL <- 10^predict(FAKEscam_fit_TTseq,newdata = data.frame(POS = TTdata$POS))
# Plot the original data and the fitted curve
p=TTdata %>%
ggplot( aes(x = as.numeric(POS), y = TTseq_RPKM)) +
geom_point(data= . %>% filter( fullLengthUNIQ > 50),aes(color = "Measured TTseq-RPKM",text=paste(FBgn,TTseq_RPKM,sep="-"))) + # Original data points with legend label
geom_line(aes(y = Fitted_VAL, color = "Predicted TTseq-RPKM"), linetype = "dotted", size=1)+
# geom_line(aes(y = Fitted_fakeVAL), linetype = "dotted",
# size = 1, color="orange") + # Fitted curve with legend label
labs(title = "Monotonic Decreasing Curve Fit",
x = "Position on Flamenco",
y = "TTseq RPKM",
color = element_blank(),
linetype = element_blank()) + # Add legend title
theme_cowplot(12) +
scale_color_manual(values = c("Measured TTseq-RPKM" = "black","Predicted TTseq-RPKM" = "orange")) + # Custom colors
scale_linetype_manual(values = c("Predicted TTseq-levels" = "dotted"))+ # Custom linetype
theme(
#legend inside plotting area
legend.position = c(0.6, 0.8)
)
print(p)
pp=ggplotly(p, tooltip="text")
pp
ggsave("flamenco_TTseq_fit.pdf", p, width = 8, height = 5)
# Plot the original data and the fitted curve
p=ggplot(TTdata, aes(x = as.numeric(POS), y = TTseq_RPKM)) +
geom_point(data= . %>% filter( fullLengthUNIQ > 50),aes(color = "Measured TTseq-RPKM",text=paste(FBgn,TTseq_RPKM,sep="-"))) + # Original data points with legend label
geom_line(aes(y = Fitted_VAL, color = "Predicted TTseq-RPKM"), linetype = "dotted", size = 1) + # Fitted curve with legend label
# geom_line(aes(y = Fitted_fakeVAL), linetype = "dotted",
# size = 1, color="orange") + # Fitted curve with legend label
scale_y_log10()+
labs(title = "Monotonic Decreasing Curve Fit",
x = "Position on Flamenco",
y = "log10 - TTseq RPKM",
color = element_blank(),
linetype = element_blank()) + # Add legend title
theme_cowplot(12) +
scale_color_manual(values = c("Measured TTseq-RPKM" = "black","Predicted TTseq-RPKM" = "orange")) + # Custom colors
scale_linetype_manual(values = c("Predicted TTseq-levels" = "dotted"))+ # Custom linetype
theme(
#legend inside plotting area
legend.position = c(0.6, 0.8)
)
print(p)
pp=ggplotly(p, tooltip="text")
pp
ggsave("flamenco_TTseq_fit.log10.pdf", p, width = 8, height = 5)
# Filter and prepare the data
RNAdata <- TABLEfilt_CLUSTER %>%
filter(RNAseq_RPKM > 1, POS>2500 ,fullLengthUNIQ >= 50 ) %>%
select(CHR,FBgn,POS, RNAseq_RPKM)%>%
{}
# Fit a monotonic decreasing smooth curve using scam
library(scam)
scam_fit_RNAseq <- scam(log10(RNAseq_RPKM) ~ s(as.numeric(POS), bs = "mpd"), data = RNAdata, sp=0.1)
plot(scam_fit_RNAseq)
#predict on fake data
fakeX= data.frame(
# FBgn=c("flam-1_CLUSTER:!:1","flam-2_CLUSTER:!:1","flam-3_CLUSTER:!:1","flam-4_CLUSTER:!:1","flam-81_CLUSTER:!:1","flam-88_CLUSTER:!:1","flam-89_CLUSTER:!:1","flam-90_CLUSTER:!:1","flam-665_CLUSTER:!:1","flam-666_CLUSTER:!:1","flam-667_CLUSTER:!:1","flam-668_CLUSTER:!:1","flam-1164_CLUSTER:!:1","flam-1165_CLUSTER:!:1","flam-1166_CLUSTER:!:1","flam-1167_CLUSTER:!:1"),
# VAL=c(750,750,750,750,180,180,180,180,20,20,20,20,1,1,1,1)
# )%>%
FBgn=c("flam-1_CLUSTER:!:1","flam-2_CLUSTER:!:1","flam-3_CLUSTER:!:1","flam-4_CLUSTER:!:1","flam-81_CLUSTER:!:1","flam-88_CLUSTER:!:1","flam-89_CLUSTER:!:1","flam-90_CLUSTER:!:1","flam-665_CLUSTER:!:1","flam-666_CLUSTER:!:1","flam-667_CLUSTER:!:1","flam-668_CLUSTER:!:1","flam-1164_CLUSTER:!:1","flam-1165_CLUSTER:!:1","flam-1166_CLUSTER:!:1","flam-1167_CLUSTER:!:1"),
RNAseq_RPKM=c(400,400,400,400,90,90,90,90,7,7,7,7,1,1,1,1)
)%>%
separate(FBgn, c("CLUSTERname","tilePOS"), sep=":!:", remove=FALSE,convert=TRUE)%>%
separate(CLUSTERname, c("CLUSTERname"), sep="_", remove=TRUE,convert=TRUE)%>%
separate(CLUSTERname, c("clusterID","POS"), sep="-", remove=FALSE,convert=TRUE)%>%
mutate(POS = (POS-1)*500+(tilePOS-1)*100)%>%
arrange(POS)
FAKEscam_fit_RNAseq <- scam(log10(RNAseq_RPKM) ~ s(as.numeric(POS), bs = "mpd"), data = fakeX, sp=0.1)
plot(FAKEscam_fit_RNAseq)
# Add the fitted values to the data frame
RNAdata = TABLEfilt_CLUSTER %>%
filter(RNAseq_RPKM > 1,UNIQinclCLUSTERuniq >= 50)
RNAdata$Fitted_VAL <- 10^predict(scam_fit_RNAseq,newdata = data.frame(POS = RNAdata$POS))
RNAdata$Fitted_fakeVAL <- 10^predict(FAKEscam_fit_RNAseq,newdata = data.frame(POS = RNAdata$POS))
# Plot the original data and the fitted curve
p=ggplot(RNAdata, aes(x = as.numeric(POS), y = RNAseq_RPKM)) +
geom_point(data= . %>% filter( fullLengthUNIQ > 50, POS>5000 ),aes(color = "Measured RNAseq-RPKM",text=paste(FBgn,RNAseq_RPKM,sep="-"))) + # Original data points with legend label
geom_line(aes(y = Fitted_VAL, color = "Predicted RNAseq-RPKM"), linetype = "dotted", size = 1) + # Fitted curve with legend label
labs(title = "Monotonic Decreasing Curve Fit",
x = "Position on Flamenco",
y = "RNAseq RPKM",
color = element_blank(),
linetype = element_blank()) + # Add legend title
theme_cowplot(12) +
scale_color_manual(values = c("Measured RNAseq-RPKM" = "black","Predicted RNAseq-RPKM" = "orange")) + # Custom colors
scale_linetype_manual(values = c("Predicted RNAseq-RPKM" = "dotted"))+ # Custom linetype
theme(
#legend inside plotting area
legend.position = c(0.6, 0.8)
)
print(p)
pp=ggplotly(p, tooltip="text")
pp
ggsave("flamenco_RNAseq_fit.pdf", p, width = 8, height = 5)
# Plot the original data and the fitted curve
p=ggplot(RNAdata, aes(x = as.numeric(POS), y = RNAseq_RPKM)) +
geom_point(data= . %>% filter( fullLengthUNIQ > 50, POS>5000 ),aes(color = "Measured RNAseq-RPKM",text=paste(FBgn,RNAseq_RPKM,sep="-"))) + # Original data points with legend label
geom_line(aes(y = Fitted_VAL, color = "Predicted RNAseq-RPKM"), linetype = "dotted", size = 1) + # Fitted curve with legend label
scale_y_log10()+
labs(title = "Monotonic Decreasing Curve Fit",
x = "Position on Flamenco",
y = "log10 - RNAseq RPKM",
linetype = element_blank()) + # Add legend title
theme_cowplot(12) +
scale_color_manual(values = c("Measured RNAseq-RPKM" = "black","Predicted RNAseq-RPKM" = "orange")) + # Custom colors
scale_linetype_manual(values = c("Predicted RNAseq-RPKM" = "dotted"))+ # Custom linetype
theme(
#legend inside plotting area
legend.position = c(0.6, 0.8)
)
print(p)
pp=ggplotly(p, tooltip="text")
pp
ggsave("flamenco_RNAseq_fit.log10.pdf", p, width = 8, height = 5)
TABLEfilt_CLUSTER_PRED = TABLEfilt_CLUSTER
TABLEfilt_CLUSTER_PRED %>%
filter(UNIQinclCLUSTERuniq > 80)%>%
ggplot(aes(POS,sRNAdata_average_WT ))+
geom_point()
new_predictions_TTseq <- 10^predict(FAKEscam_fit_TTseq, newdata = data.frame(POS = TABLEfilt_CLUSTER_PRED$POS))
TABLEfilt_CLUSTER_PRED <- cbind(TABLEfilt_CLUSTER_PRED, TTseq_predicted_FAKE = new_predictions_TTseq)
new_predictions_RNAseq <- 10^predict(FAKEscam_fit_RNAseq, newdata = data.frame(POS = TABLEfilt_CLUSTER_PRED$POS))
TABLEfilt_CLUSTER_PRED <- cbind(TABLEfilt_CLUSTER_PRED, RNAseq_predicted_FAKE = new_predictions_RNAseq)
new_predictions_TTseq <- 10^predict(scam_fit_TTseq, newdata = data.frame(POS = TABLEfilt_CLUSTER_PRED$POS))
TABLEfilt_CLUSTER_PRED <- cbind(TABLEfilt_CLUSTER_PRED, TTseq_predicted = new_predictions_TTseq)
new_predictions_RNAseq <- 10^predict(scam_fit_RNAseq, newdata = data.frame(POS = TABLEfilt_CLUSTER_PRED$POS))
TABLEfilt_CLUSTER_PRED <- cbind(TABLEfilt_CLUSTER_PRED, RNAseq_predicted = new_predictions_RNAseq)
TABLEfilt_CLUSTER_PRED %>%
ggplot(aes(x=RNAseq_RPKM, y=RNAseq_predicted, color=POS))+
geom_point()+
geom_abline(intercept = 0, slope = 1)+
scale_x_log10()+
scale_y_log10()+
theme_bw()+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
aspect.ratio = 1
)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1046 rows containing missing values or values outside the scale range
## (`geom_point()`).
TABLEfilt_CLUSTER_PRED %>%
separate(FBgn, c("CLUSTERname","tilePOS"), sep=":!:", remove=FALSE,convert=TRUE)%>%
separate(CLUSTERname, c("CLUSTERname"), sep="_", remove=TRUE,convert=TRUE)%>%
separate(CLUSTERname, c("clusterID","POS"), sep="-", remove=FALSE,convert=TRUE)%>%
mutate(POS = (POS-1)*500+(tilePOS-1)*100)%>%
filter(POS<650000)%>%
select(POS,RNAseq_RPKM, RNAseq_predicted, TTseq_RPKM, TTseq_predicted, RNAseq_predicted_FAKE, TTseq_predicted_FAKE, FBgn,fullLengthUNIQ,nPOS) %>%
pivot_longer(-c(POS,FBgn,fullLengthUNIQ))%>%
mutate(
CLASS= case_when(
str_detect(name,"RNAseq") ~ "RNAseq",
str_detect(name,"TTseq") ~ "TTseq",
TRUE ~ "Unknown"
)
)%>%
ggplot(aes(x=POS, y=value, color=name))+
geom_point(data=.%>%filter(str_detect(name,"RPKM")&fullLengthUNIQ>50),size=0.2)+
geom_smooth(data=.%>%filter(str_detect(name,"RPKM")&fullLengthUNIQ>50))+
geom_smooth(data=.%>%filter(!str_detect(name,"RPKM")))+
facet_row(~CLASS)+
scale_y_log10()+
# scale_y_continuous(limits=c(0,1000))+
theme_bw()+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
aspect.ratio = 1
)
## Warning: Expected 1 pieces. Additional pieces discarded in 2358 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#introduce new column into tablefilt with either predicted or original TTseq_RPKM
TABLEfilt_CLUSTER_PRED <- TABLEfilt_CLUSTER_PRED %>%
mutate(
TTseq_original = TTseq_RPKM,
TTseq_RPKM = case_when(
# str_detect(FBgn, "flam") ~ log10(TTseq_predicted_FAKE),
str_detect(FBgn, "flam") ~ TTseq_predicted,
TRUE ~ TTseq_RPKM
),
RNAseq_original = RNAseq_RPKM,
RNAseq_RPKM = case_when(
# str_detect(FBgn, "flam") ~ log10(RNAseq_predicted_FAKE),
str_detect(FBgn, "flam") ~ RNAseq_predicted,
TRUE ~ RNAseq_RPKM
)
)%>%
{}
h2o.init(max_mem_size = "20g", nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 13 hours
## H2O cluster timezone: Europe/Vienna
## H2O data parsing timezone: UTC
## H2O cluster version: 3.46.0.7
## H2O cluster version age: 10 months and 12 days
## H2O cluster name: H2O_started_from_R_dominik.handler_fuo893
## H2O cluster total nodes: 1
## H2O cluster total memory: 17.70 GB
## H2O cluster total cores: 11
## H2O cluster allowed cores: 11
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (10 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
h2o.removeAll()
minLIMIT=-0.5
maxLIMIT=5
for(currMODEL in c("model_ablation_study_all_vari","model_ablation_study_no_nuc","model_ablation_study_no_tt")){
# for(currMODEL in c("model_ablation_study_all_vari")){
print(currMODEL)
yb_ml <- h2o.loadModel(paste("../Fig.3/",currMODEL,sep=""))
TABLEfilt_CLUSTER_PRED_sRNA=TABLEfilt_CLUSTER_PRED %>%
filter(UNIQinclCLUSTERuniq > 50 , POS<600000, POS>5000, sRNAdata_average_WT>0,RNAseq_RPKM>0, TTseq_RPKM>0)
TABLEfilt_CLUSTER_PRED_sRNA = TABLEfilt_CLUSTER_PRED_sRNA %>%
as.h2o()
model = yb_ml
pred_FULL <- h2o.predict(model, TABLEfilt_CLUSTER_PRED_sRNA)
#calculate correlation between predicted and observed values
pred_dataFIN= as_tibble(TABLEfilt_CLUSTER_PRED_sRNA) %>%
bind_cols(PRED=as.vector(pred_FULL))%>%
mutate(
sRNAdata_average_WT = log10(sRNAdata_average_WT),
PRED = PRED
)
#plot scatter
#calculate correlation
correlation <- cor.test(pred_dataFIN$PRED, pred_dataFIN$sRNAdata_average_WT)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_WT) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED)
#calculate lm for the offset calculation
#here I switch to Real ~ Pred
fit <- lm( sRNAdata_average_WT ~PRED , data = pred_dataFIN)
intercept <- fit$coefficients[1]
slope <- fit$coefficients[2]
#calculate data to plot model in ggplot
newdata <- data.frame(PRED = seq(min(pred_dataFIN$PRED), max(pred_dataFIN$PRED), length.out = 100))
newdata$x_pred <- predict(fit, newdata)
#only pearson correlation
PEAR=cor.test( pred_dataFIN$PRED, pred_dataFIN$sRNAdata_average_WT, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho( pred_dataFIN$PRED, pred_dataFIN$sRNAdata_average_WT)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_WT,y=PRED, label=paste(FBgn,RNAseq_RPKM,TTseq_RPKM,sep="_") )) +
geom_point_rast(size=1.5,alpha=0.3,shape = 16, stroke = 0)+
geom_density_2d(color="black",alpha=0.5, size=0.6)+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
geom_line(data=newdata, aes(y = PRED, x = x_pred,label=PRED), color = "red", linetype = "dotted", linewidth=2 ) +
scale_x_continuous(limits=c(minLIMIT,maxLIMIT))+
scale_y_continuous(limits=c(minLIMIT,maxLIMIT))+
# # facet_wrap(~nPOS)+
annotation_logticks(sides = "lb", outside=FALSE) +
annotate("text", x = 0.5, y = 3,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_d()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p)
ggsave(paste("Prediction_vs_real.large.cluster.",currMODEL,".log.pdf", sep=""),p , width = 6, height = 6, dpi = 300)
#determine calibration curve
calibration_curve <- pred_dataFIN %>%
mutate(bin = cut_interval(PRED, n=20)) %>%
group_by(bin) %>%
summarize(
mean_predicted_probability = mean(PRED),
observed_frequency = mean(sRNAdata_average_WT)
)
# Plot calibration curve
p=calibration_curve %>%
ggplot(aes(y=mean_predicted_probability, x=observed_frequency))+
geom_point()+
geom_smooth(method = "lm", se = TRUE, color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "Calibration Curve", x = "Mean Observed sRNA count", y = "Mean Predicted sRNA count") +
theme_cowplot(14)+
scale_x_log10(limits=c(0.7,5))+
scale_y_log10(limits=c(0.7,5))+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
aspect.ratio = 1
)
print(p)
###############################################################################################
#determine fit of the data and calibration
#calibrate model according to the determined value
pred_dataFIN = pred_dataFIN %>%
mutate(PRED_scaled = intercept + PRED)
#calculate correlation
correlation <- cor.test( pred_dataFIN$PRED_scaled,pred_dataFIN$sRNAdata_average_WT)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED_scaled ~ pred_dataFIN$sRNAdata_average_WT) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$PRED_scaled,pred_dataFIN$sRNAdata_average_WT)
fit_calibrated <- lm( sRNAdata_average_WT ~ PRED_scaled , data = pred_dataFIN)
intercept_calibrated <- fit_calibrated$coefficients[1]
slope_calibrated <- fit_calibrated$coefficients[2]
#calculate data to plot model in ggplot
newdata_calibrated <- data.frame(PRED_scaled = seq(min(pred_dataFIN$PRED_scaled), max(pred_dataFIN$PRED_scaled), length.out = 100))
newdata_calibrated$x_pred <- predict(fit_calibrated, newdata_calibrated)
#only pearson correlation
PEAR=cor.test(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED_scaled, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED_scaled)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_WT,y=PRED_scaled, label=paste(FBgn,RNAseq_RPKM,TTseq_RPKM,sep="_"))) +
geom_point_rast(size=1.5,alpha=0.3,shape = 16, stroke = 0)+
geom_density_2d(color="black",alpha=0.5, size=0.6)+
# geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
geom_line(data=newdata_calibrated, aes(y = PRED_scaled, x = x_pred,label=PRED_scaled), color = "red", linetype = "dotted", linewidth=2 ) +
scale_x_continuous(limits=c(minLIMIT,maxLIMIT))+
scale_y_continuous(limits=c(minLIMIT,maxLIMIT))+
# # facet_wrap(~nPOS)+
annotation_logticks(sides = "lb", outside=FALSE) +
annotate("text", x = 0.9, y = 3,
label = paste("R2: ", round(correlation, 3),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_d()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p)
ggsave(paste("Prediction_vs_real.large.cluster.rescaled.",currMODEL,".log.pdf",sep=""),p , width = 6, height = 6, dpi = 300)
ggplotly(p, tooltip = c("label"))
###############################################################################################
#convert to real number space
#calibrate model according to the determined value
pred_dataFIN_real = pred_dataFIN %>%
mutate(
PRED_scaled = 10^PRED_scaled,
sRNAdata_average_WT = 10^sRNAdata_average_WT)
#calculate correlation
correlation <- cor.test( pred_dataFIN_real$PRED_scaled,pred_dataFIN_real$sRNAdata_average_WT)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN_real$PRED_scaled ~ pred_dataFIN_real$sRNAdata_average_WT) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN_real$PRED_scaled,pred_dataFIN_real$sRNAdata_average_WT)
#only pearson correlation
PEAR=cor.test(pred_dataFIN_real$sRNAdata_average_WT, pred_dataFIN_real$PRED_scaled, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN_real$sRNAdata_average_WT, pred_dataFIN_real$PRED_scaled)
#plot predictions vs real data
p=pred_dataFIN_real %>%
ggplot(aes(x=sRNAdata_average_WT,y=PRED_scaled, label=paste(FBgn,RNAseq_RPKM,TTseq_RPKM,sep="_"))) +
geom_point_rast(size=0.5,alpha=0.3)+
geom_density_2d(color="darkgrey",alpha=0.5)+
# geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
scale_x_log10(limits=c(1,100000))+
scale_y_log10(limits=c(1,100000))+
# # facet_wrap(~nPOS)+
annotate("text", x = 2, y = 5000,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_d()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank()
)
print(p)
ggsave(paste("Prediction_vs_real.large.cluster.rescaled.",currMODEL,".pdf",sep=""),p , width = 6, height = 6, dpi = 300)
ggplotly(p, tooltip = c("label"))
###############################################################################################
p=pred_dataFIN_real %>%
pivot_longer(c(sRNAdata_average_WT,PRED_scaled), values_to = "VALUE", names_to = "LIB") %>%
ggplot(aes(x=POS, y=VALUE, color=LIB, text=FBgn))+
geom_point(size=0.2)+
scale_y_log10()+
facet_col(~TEorientation_nucTILE)
print(p)
ggplotly(p, tooltip = c("text"))
p=pred_dataFIN_real %>%
pivot_longer(c(sRNAdata_average_WT,PRED_scaled), values_to = "VALUE", names_to = "LIB") %>%
ggplot(aes(x=POS, y=VALUE, color=LIB))+
# geom_point(size=0.2)+
geom_smooth(method = "loess", span = 0.2, se = TRUE) +
scale_y_log10()
print(p)
ggplotly(p, tooltip = c("text"))
p=pred_dataFIN_real %>%
mutate(RATIO= PRED_scaled/sRNAdata_average_WT)%>%
ggplot(aes(x=POS, y=RATIO))+
geom_point(aes(color=TEorientation_nucTILE),size=0.2)+
geom_smooth(method = "loess", span = 0.3, se = FALSE) +
scale_y_log10()
print(p)
p=pred_dataFIN_real %>%
mutate(
RATIO = PRED_scaled/sRNAdata_average_WT
)%>%
ggplot(aes(x=POS, y=RNAseq_RPKM, text=FBgn))+
geom_point(aes(color=TEorientation_nucTILE),size=0.2)+
geom_point(aes(y=RNAseq_original), size=0.2)+
# scale_y_log10()+
# facet_col(~TEorientation_nucTILE)+
geom_hline(yintercept = 1, linetype = "dashed", color = "red")
print(p)
library(zoo)
library(dplyr)
library(ggplot2)
window_size <- 20
gap_threshold <- 5000 # Set this to the maximum allowed gap (in POS units) for continuous regions
# Prepare data with rolling average and group by contiguous regions
p = pred_dataFIN_real %>%
pivot_longer(
c(sRNAdata_average_WT, PRED_scaled),
values_to = "VALUE",
names_to = "LIB"
) %>%
arrange(LIB, POS) %>%
group_by(LIB) %>%
mutate(
ROLLING = rollmean(VALUE, k = window_size, fill = NA, align = "center"),
# Identify gaps
gap = c(0, diff(POS)),
region = cumsum(gap > gap_threshold)
) %>%
filter(!is.na(ROLLING)) %>%
ggplot(aes(x = POS, y = ROLLING, color = LIB)) +
# geom_point(size = 0.3, aes(text=paste(FBgn,diNUC_TT,RNAseq_RPKM,TTseq_RPKM))) +
geom_point(aes(y= VALUE,text=paste(FBgn,diNUC_TT,RNAseq_RPKM,TTseq_RPKM)), size = 0.1, alpha = 0.5) +
geom_line(data = function(df) df[!is.na(df$ROLLING), ], aes(y = ROLLING, group = interaction(region,LIB))) +
# geom_smooth()+
scale_color_scico_d(palette = "batlow", begin = 0.2, end = 0.8) +
labs(
title = "Rolling Average of sRNA and Predicted sRNA Levels",
x = "Position on Flamenco",
y = "Rolling Average of sRNA Levels"
) +
theme_cowplot(14) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
legend.position= c(0.4, 0.9), # Position the legend inside the plot area
aspect.ratio = 1
)+
{}
print(p)
ggsave(paste("flamenco_sRNA_predicted_rolling-smooth.",currMODEL,".pdf",sep=""), p, width = 8, height = 5)
p = p+
scale_y_log10()
print(p)
ggsave(paste("flamenco_sRNA_predicted_rolling-smooth.",currMODEL,".log.pdf",sep=""), p, width = 8, height = 5)
}
## [1] "model_ablation_study_all_vari"
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset is missing column 'UTR_LENGTH': substituting in a column of NaN
## Warning in geom_line(data = newdata, aes(y = PRED, x = x_pred, label = PRED), :
## Ignoring unknown aesthetics: label
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(limits = c(0.7, 5)): log-10 transformation introduced
## infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(limits = c(0.7, 5)): log-10 transformation introduced
## infinite values.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_line(data = newdata_calibrated, aes(y = PRED_scaled, x =
## x_pred, : Ignoring unknown aesthetics: label
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLogticks() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 745 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_point(aes(y = VALUE, text = paste(FBgn, diNUC_TT, RNAseq_RPKM,
## : Ignoring unknown aesthetics: text
## [1] "model_ablation_study_no_nuc"
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset is missing column 'UTR_LENGTH': substituting in a column of NaN
## Warning in geom_line(data = newdata, aes(y = PRED, x = x_pred, label = PRED), :
## Ignoring unknown aesthetics: label
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_line(data = newdata_calibrated, aes(y = PRED_scaled, x =
## x_pred, : Ignoring unknown aesthetics: label
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLogticks() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 745 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_point(aes(y = VALUE, text = paste(FBgn, diNUC_TT, RNAseq_RPKM,
## : Ignoring unknown aesthetics: text
## [1] "model_ablation_study_no_tt"
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset is missing column 'UTR_LENGTH': substituting in a column of NaN
## Warning in geom_line(data = newdata, aes(y = PRED, x = x_pred, label = PRED), :
## Ignoring unknown aesthetics: label
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(limits = c(0.7, 5)): log-10 transformation introduced
## infinite values.
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_log10(limits = c(0.7, 5)): log-10 transformation introduced
## infinite values.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_line(data = newdata_calibrated, aes(y = PRED_scaled, x =
## x_pred, : Ignoring unknown aesthetics: label
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLogticks() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 745 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in geom_point(aes(y = VALUE, text = paste(FBgn, diNUC_TT, RNAseq_RPKM,
## : Ignoring unknown aesthetics: text
############################################################################################### ############################################################################################### ############################################################################################### # ########################### plot nuc content over flamenco sense region ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # nuc-content
RAW = read_tsv("nuc_content_flamenco-sense.txt", col_names = TRUE)%>%
{}
plotTABLE = RAW %>%
pivot_longer( cols = starts_with('NUC_'), names_to = 'Nucleotide', values_to = 'Percentage') %>%
filter(Nucleotide == 'NUC_T') %>%
group_by(SEQ, Nucleotide) %>%
arrange(POS) %>%
mutate(
Percentage_smooth = rollmean(Percentage, k = 100, fill = NA, align = "center")
) %>%
ungroup()
END=plotTABLE %>%
select(POS)%>%
max()
p = plotTABLE %>%
ggplot( aes(x = POS, y = Percentage_smooth)) +
geom_line(size=0.3) +
geom_vline(xintercept = c(1600,END-1600), linetype="dashed", color = "red", size=0.3)+
labs(title = 'Nucleotide Content Along Sequence', x = 'Position', y = 'Percentage (%)') +
# facet_grid(SEQ~Nucleotide, scales="free_x", space = "free")+
theme_cowplot()+
theme(
panel.grid.major.y = element_line(color = 'black', linetype = 'dashed', size = 0.3),
panel.grid.minor.y = element_line(color = 'grey', linetype = 'dashed', size = 0.3),
)
print(p)
ggsave(filename = "nuc-content-flam.U.pdf", plot = p, width = 10, height = 3)